suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(DESeq2))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(hyperSpec))
suppressPackageStartupMessages(library(parallel))
suppressPackageStartupMessages(library(pander))
suppressPackageStartupMessages(library(plotly))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(scatterplot3d))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tximport))
suppressPackageStartupMessages(library(vsn))
suppressPackageStartupMessages(library(here))
source(here("UPSCb-common/src/R/plot.multidensity.R"))
source(here("UPSCb-common/src/R/featureSelection.R"))
pal <- brewer.pal(12,"Paired")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
samples <- read_csv("~/Git/zygoticEmbryogenesis/doc/ZE_Dataset_v3.csv",
col_types = cols(col_character(),
col_character(),
col_factor(),
col_character(),
col_factor(),
col_character(),
col_double(),
col_double())) %>%
mutate(Tissue=factor(Tissue)) %>%
mutate(Time=factor(Time))
lb.filelist <- list.files(here("data/RNA-Seq/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)
Filter and select only one duplicate from each sample.
lb.filterlist <- str_subset(lb.filelist,"L001_sortmerna_trimmomatic")
#removed P11562_112 trimmomatic dataset
lb.filterlist <- (str_subset(lb.filterlist, "/mnt/picea/home/mstewart/Git/zygoticEmbryogenesis/data/RNA-Seq/salmon/P11562_112_S11_L001_sortmerna_trimmomatic/quant.sf", negate = TRUE))
#removed P11562_112 row from sample list
samples <- filter(samples, !grepl("112",NGI.ID))
#samples <- filter(samples, !grepl("P11562_201",NGI.ID)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S
#lb.filterlist <- (str_subset(lb.filterlist, "P11562_201", negate = TRUE)) NOT REMOVING THIS SAMPLE ANYMORE, FIXED FROM FMG TO S
stopifnot(all(str_which(lb.filterlist, samples$NGI.ID) == 1:length(lb.filterlist)))
##assign names to the filtered filelist (which removed L002)
names(lb.filterlist) <- samples$NGI.ID
lb.filterlist <- lb.filterlist[samples$Tissue %in% c("ZE","FMG","S")] ##Are we only looking at ZE?
samples <- subset(samples, select = -c(User.ID, Sample.ID, Replicate, Mreads, X..Q30) )
Read the expression at the gene level (there is one transcript per gene)
lb.g <- suppressMessages(tximport(files = lb.filterlist,
type = "salmon",txOut=TRUE))
counts <- round(lb.g$counts)
Check how many genes are never expressed - reasonable level of non-expressed genes indicated.
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))
## [1] "19.5% percent (12910) of 66069 genes are not expressed"
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
aes(x,y,col=Tissue,fill=Time)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
ggplot(tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples[match(names(lb.filterlist),samples$NGI.ID),]),
aes(x,y,col=Tissue,fill=)) + geom_col() +
scale_y_continuous(name="reads") +
theme(axis.text.x=element_text(angle=90),axis.title.x=element_blank())
Display the per-gene mean expression
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
The cumulative gene coverage is as expected
ggplot(melt(log10(rowMeans(counts))),aes(x=value)) +
geom_density() + ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)")
## Warning: Removed 12910 rows containing non-finite values (stat_density).
The same is done for the individual samples colored by condition. The gene coverage across samples is extremely similar
dat <- as.data.frame(log10(counts)) %>% utils::stack() %>%
mutate(Tissue=samples$Tissue[match(ind,samples$NGI.ID)]) %>%
mutate(Time=samples$Time[match(ind,samples$NGI.ID)])
Color by Experiment ####OBJECT TISSUE NOT FOUND############################################################
ggplot(dat,aes(x=values,group=ind,col=Tissue)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 1767086 rows containing non-finite values (stat_density).
Color by Time
ggplot(dat,aes(x=values,group=ind,col=Time)) +
geom_density() + ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)")
## Warning: Removed 1767086 rows containing non-finite values (stat_density).
dir.create(here("analysis","salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("analysis/salmon/ZE-unnormalised-gene-expression_data.csv"))
############## change export location name
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate
dds <- DESeqDataSetFromMatrix(
countData = counts,
colData = samples,
design = ~ Time - Tissue)
## converting counts to integer mode
##############################does not like *, + or /, will only take - as the model.
save(dds,file=here("analysis/salmon/ZE-Dataset-dds.rda"))
Check the size factors (i.e. the sequencing library size effect)
The sequencing depth is relatively variable (0 to 200 %) however the low end of the variance is likely due to poor samples where little sequencing data was actually produce that wasnt 16s bacterial.
dds <- estimateSizeFactors(dds)
sizes <- sizeFactors(dds)
pander(sizes)
| P11562_101 | P11562_102 | P11562_103 | P11562_104 | P11562_105 | P11562_106 |
|---|---|---|---|---|---|
| 1.15 | 0.6652 | 0.6665 | 0.9087 | 0.8328 | 1.123 |
| P11562_107 | P11562_108 | P11562_110 | P11562_111 | P11562_113 | P11562_114 |
|---|---|---|---|---|---|
| 0.9761 | 0.7257 | 0.5128 | 1.581 | 1.415 | 0.6632 |
| P11562_115 | P11562_116 | P11562_117 | P11562_118 | P11562_119 | P11562_120 |
|---|---|---|---|---|---|
| 1 | 0.2937 | 1.191 | 0.5468 | 1.041 | 0.7201 |
| P11562_121 | P11562_123 | P11562_124 | P11562_125 | P11562_126 | P11562_127 |
|---|---|---|---|---|---|
| 1.332 | 1.122 | 0.4229 | 1.206 | 0.5732 | 1.553 |
| P11562_128 | P11562_129 | P11562_130 | P11562_131 | P11562_133 | P11562_134 |
|---|---|---|---|---|---|
| 1.097 | 1.243 | 1.113 | 1.23 | 1.754 | 1.175 |
| P11562_135 | P11562_136 | P11562_137 | P11562_138 | P11562_139 | P11562_140 |
|---|---|---|---|---|---|
| 1.565 | 1.276 | 1.076 | 0.8371 | 1.088 | 1.05 |
| P11562_141 | P11562_142 | P11562_143 | P11562_145 | P11562_146 | P11562_147 |
|---|---|---|---|---|---|
| 1.742 | 1.592 | 0.9982 | 1.499 | 0.908 | 1.003 |
| P11562_148 | P11562_149 | P11562_150 | P11562_151 | P11562_153 | P11562_154 |
|---|---|---|---|---|---|
| 1.767 | 1.052 | 1.634 | 1.285 | 0.5884 | 1.082 |
| P11562_155 | P11562_157 | P11562_201 | P11562_202 | P11562_203 | P11562_204 |
|---|---|---|---|---|---|
| 1.781 | 1.234 | 0.7672 | 1.11 | 1.309 | 1.103 |
| P11562_205 | P11562_206 |
|---|---|
| 1.106 | 0.6848 |
boxplot(sizes, main="Sequencing libraries size factor")
vsd <- varianceStabilizingTransformation(dds, blind=FALSE)
vst <- assay(vsd)
vst <- vst - min(vst)
The variance stabilisation worked very well, the data is nearly homoscesdastic
meanSdPlot(vst[rowSums(counts)>0,])
pc <- prcomp(t(vst))
percent <- round(summary(pc)$importance[2,]*100)
Seems that different time points form small clusters and ZE and FMG tissue types appear to separate. These are Comp1 and Comp2 which explains the different between most of the sampels except for one B4 ZE sample. Appears to be an outlier. This seems to indicate that the Tissue and Time components explain the difference between samples.
mar=c(5.1,4.1,4.1,2.1)
scatterplot3d(pc$x[,1],
pc$x[,2],
pc$x[,3],
xlab=paste("Comp. 1 (",percent[1],"%)",sep=""),
ylab=paste("Comp. 2 (",percent[2],"%)",sep=""),
zlab=paste("Comp. 3 (",percent[3],"%)",sep=""),
color=pal[as.integer(samples$Time)],
pch=c(17,19,15)[as.integer(samples$Tissue)])
legend("topleft",
fill=pal[1:nlevels(samples$Time)],
legend=levels(samples$Time))
legend("bottomright",
pch=c(17,19,15),
legend=levels(samples$Tissue))
par(mar=mar)
pc.dat <- bind_cols(PC1=pc$x[,1],
PC2=pc$x[,2],
samples)
ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts") +
scale_x_continuous(name=element_text(paste("PC1 (",percent[1],"%)",sep=""))) +
scale_y_continuous(name=element_text(paste("PC2 (",percent[2],"%)",sep="")))
suppressPackageStartupMessages(library(plotly))
interplot <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Time,shape=Tissue,text=NGI.ID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(interplot, tooltip = "all") %>% layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep="")))
Filter for noise A cutoff at a VST value of 1 leaves about 32000 genes - is this adequate for the QA?
conds <- factor(paste(samples$Tissue,samples$Time))
sels <- rangeFeatureSelect(counts=vst,
conditions=conds,
nrep=3)
## Warning in xy.coords(x, y, xlabel, ylabel, log): 1 y value <= 0 omitted
## from logarithmic plot
vstCutoff <- 4+1
heatmap.2(t(scale(t(vst[sels[[vstCutoff]],]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[vstCutoff]],]),decreasing=TRUE) [1:1000]
Cluster of subset compared to overall appears to be similar in shape with a shift in increased VST expression and slightly more spread out density. These most variable genes appear to have higher levels of VST expression.
ggplot(list(sub=rowMeans(vst[sels[[vstCutoff]],][ord,]),
total=rowMeans(vst[sels[[vstCutoff]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
Variance in expression in different tissues can be seen. Certain tissues samples appear the same as in other tissues. It appears that tissues have different expression patterns. It can also be seen that over different time points, that gene expression between the samples change slightly over time.
heatmap.2(t(scale(t(vst[sels[[vstCutoff]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
ord <- order(rowSds(vst[sels[[vstCutoff]],])) [1:1000]
The subset is enriched for two subsets of genes - ones which have relatively the same level of VST expression as the total gene subset but also a much higher population of more highly VST expressed genes.
ggplot(list(sub=rowMeans(vst[sels[[vstCutoff]],][ord,]),
total=rowMeans(vst[sels[[vstCutoff]],])) %>% melt(),
aes(x=value,col=L1)) +
geom_density() +
ggtitle("Density of the subset vs. overall") +
scale_x_continuous(name=element_text("VST expression")) +
theme(legend.title=element_blank())
The clustering for the least variable genes shows a very small change in separation of gene expression by Tissue and Time. There is a more stark difference in the Somatic samples.
heatmap.2(t(scale(t(vst[sels[[vstCutoff]],][ord,]))),
distfun=pearson.dist,
hclustfun=function(X){hclust(X,method="ward.D2")},
labRow = NA,trace = "none",
labCol = conds,
col=hpal)
The data appears to show a correlation between Tissue and Time, with some clustering in the PCA plot showing a movement of time in ascending order from right to left (+ to -) along the X axis. There can also be seen that along the Y axis, there are subgroups which correlate to the Tissue type. Along with this, it appears that overall gene expression does not appear starkly different but slight differences can be observed between Tissue types. It appears that the most difference between Tissues can be seen in the 1000 most variable genes, with some small changes between Time points within those Tissue groups. The final heat map of the 1000 least variable genes appear to show a very mixed expression pattern, similar in appearance to the total gene heat map, between Tissue groups and Time points, except for in the Somatic tissue type.
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] here_0.1 vsn_3.52.0
## [3] tximport_1.12.3 forcats_0.4.0
## [5] stringr_1.4.0 dplyr_0.8.3
## [7] purrr_0.3.2 readr_1.3.1
## [9] tidyr_0.8.3 tibble_2.1.3
## [11] tidyverse_1.2.1 scatterplot3d_0.3-41
## [13] RColorBrewer_1.1-2 plotly_4.9.0
## [15] pander_0.6.3 hyperSpec_0.99-20180627
## [17] ggplot2_3.2.1 lattice_0.20-38
## [19] gplots_3.0.1.1 DESeq2_1.24.0
## [21] SummarizedExperiment_1.14.1 DelayedArray_0.10.0
## [23] BiocParallel_1.18.1 matrixStats_0.54.0
## [25] Biobase_2.44.0 GenomicRanges_1.36.0
## [27] GenomeInfoDb_1.20.0 IRanges_2.18.2
## [29] S4Vectors_0.22.0 BiocGenerics_0.30.0
## [31] data.table_1.12.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 rprojroot_1.3-2 htmlTable_1.13.1
## [4] XVector_0.24.0 base64enc_0.1-3 rstudioapi_0.10
## [7] hexbin_1.27.3 affyio_1.54.0 bit64_0.9-7
## [10] AnnotationDbi_1.46.1 lubridate_1.7.4 xml2_1.2.2
## [13] splines_3.6.1 geneplotter_1.62.0 knitr_1.24
## [16] zeallot_0.1.0 Formula_1.2-3 jsonlite_1.6
## [19] broom_0.5.2 annotate_1.62.0 cluster_2.1.0
## [22] shiny_1.3.2 BiocManager_1.30.4 compiler_3.6.1
## [25] httr_1.4.1 backports_1.1.4 assertthat_0.2.1
## [28] Matrix_1.2-17 lazyeval_0.2.2 limma_3.40.6
## [31] cli_1.1.0 later_0.8.0 acepack_1.4.1
## [34] htmltools_0.3.6 tools_3.6.1 gtable_0.3.0
## [37] glue_1.3.1 GenomeInfoDbData_1.2.1 affy_1.62.0
## [40] reshape2_1.4.3 Rcpp_1.0.2 cellranger_1.1.0
## [43] vctrs_0.2.0 preprocessCore_1.46.0 gdata_2.18.0
## [46] nlme_3.1-141 crosstalk_1.0.0 xfun_0.9
## [49] testthat_2.2.1 rvest_0.3.4 mime_0.7
## [52] gtools_3.8.1 XML_3.98-1.20 zlibbioc_1.30.0
## [55] scales_1.0.0 promises_1.0.1 hms_0.5.1
## [58] yaml_2.2.0 memoise_1.1.0 gridExtra_2.3
## [61] rpart_4.1-15 latticeExtra_0.6-28 stringi_1.4.3
## [64] RSQLite_2.1.2 highr_0.8 genefilter_1.66.0
## [67] checkmate_1.9.4 caTools_1.17.1.2 rlang_0.4.0
## [70] pkgconfig_2.0.2 bitops_1.0-6 evaluate_0.14
## [73] labeling_0.3 htmlwidgets_1.3 bit_1.1-14
## [76] tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
## [79] R6_2.4.0 generics_0.0.2 Hmisc_4.2-0
## [82] DBI_1.0.0 pillar_1.4.2 haven_2.1.1
## [85] foreign_0.8-72 withr_2.1.2 survival_2.44-1.1
## [88] RCurl_1.95-4.12 nnet_7.3-12 modelr_0.1.5
## [91] crayon_1.3.4 KernSmooth_2.23-15 rmarkdown_1.15
## [94] locfit_1.5-9.1 readxl_1.3.1 blob_1.2.0
## [97] digest_0.6.20 xtable_1.8-4 httpuv_1.5.1
## [100] munsell_0.5.0 viridisLite_0.3.0